#! /usr/bin/env python

"""This module hold functions that help in diagnosing and fixing problems in 
systems related to MRImpvrov (not the system itself). This includes problems
related to the HDFS
"""

import ma.const
import ma.net.netutils

import sys
import getopt
import os.path
import re
import subprocess


namespaceIDstr = 'namespaceID='


def check_fix_hdfs_namespace_id():
    """Fix namespace IDs across all nodes - http://issues.apache.org/jira/browse/HDFS-107.

    This function is supposed to be run on a machine which has the namenode.
    It will first identify the namespace ID which is currently being used by
    the namenode and then synchronize it with all the data nodes.

    Exits with status 2 if the namenode/datanode directories cannot be
    found in the HDFS configuration XML.
    """

    # Print the function introduction for the operator.
    print("""Lets try to diagnose the HDFS problem of namespace ID invalidation;  
this is a common error and might prevent you starting all the 
data-nodes. Refer to http://issues.apache.org/jira/browse/HDFS-107.

For this function to run correctly, it needs to be run on the node
which runs the namenode.

It will first find the namespace id which is currently being used 
by the namenode and then synchronize it with all the data nodes
""")
    print(getNamenodeNamespaceID())

    # Read the HDFS configuration XML once (path comes from project config).
    # NOTE: the original read this file a second time through a shell=True
    # `cat` subprocess into the same variable; that redundant (and leaky)
    # re-read has been removed.
    hdfs_conf_xml_path = ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_conf_path)
    with open(hdfs_conf_xml_path, 'r') as conf_file:
        hdfs_conf_xml_str = conf_file.read()

    # Regex that locates the HDFS namenode directory in the conf XML.
    hdfs_name_dir_re = re.compile(ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_name_dir_re))
    namen_matches = hdfs_name_dir_re.findall(hdfs_conf_xml_str)

    # Regex that locates the HDFS datanode directory in the conf XML.
    hdfs_data_dir_re = re.compile(ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_data_dir_re))
    datan_matches = hdfs_data_dir_re.findall(hdfs_conf_xml_str)

    # Relative filepath of the VERSION file under each dfs directory.
    hdfs_ver_rel_filepath = ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_ver_relative_filepath)

    # If both directories were found in the XML, build the VERSION filepaths.
    if namen_matches and datan_matches:
        hdfs_name_fp = os.path.join(namen_matches[0], hdfs_ver_rel_filepath)
        hdfs_data_fp = os.path.join(datan_matches[0], hdfs_ver_rel_filepath)

        print(hdfs_name_fp)
        print(hdfs_data_fp)
    else:
        sys.stderr.write('\nERROR: HDFS namenode and/or datanode path could not be found - check if they are defined in %s\n' % hdfs_conf_xml_path)
        sys.exit(2)


def getNamenodeNamespaceID():
    """Return the namespaceID currently recorded in the namenode VERSION file.

    Reads the HDFS configuration XML to locate the namenode directory,
    then scans its VERSION file for the first numeric ID on a line
    containing `namespaceID=`.

    Returns:
        The namespace ID as a string, or None if no numeric ID is found.

    Exits with status 2 if the VERSION file or the configured namenode
    directory cannot be located.
    """
    # Read the HDFS configuration XML (path comes from project config).
    hdfs_conf_xml_path = ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_conf_path)
    with open(hdfs_conf_xml_path, 'r') as conf_file:
        hdfs_conf_xml_str = conf_file.read()

    # Regex that locates the HDFS namenode directory in the conf XML.
    hdfs_name_dir_re = re.compile(ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_name_dir_re))
    namen_matches = hdfs_name_dir_re.findall(hdfs_conf_xml_str)

    # Relative filepath of the VERSION file under the dfs name directory.
    hdfs_ver_rel_filepath = ma.const.XmlData.get_filepath_str_data(ma.const.xml_hdfs_ver_relative_filepath)

    if not namen_matches:
        sys.stderr.write('\nERROR: HDFS namenode and/or datanode path could not be found - check if they are defined in %s\n' % hdfs_conf_xml_path)
        sys.exit(2)

    hdfs_name_ver_fp = os.path.join(namen_matches[0], hdfs_ver_rel_filepath)
    if not os.path.exists(hdfs_name_ver_fp):
        sys.stderr.write('\nERROR: Cannot find the namenode VERSION file - %s\n' % hdfs_name_ver_fp)
        sys.exit(2)

    # Read the VERSION file directly instead of shelling out to
    # `cat ... | egrep ...` with shell=True (no subprocess, no leaked pipe,
    # no shell-injection surface from the filepath).
    with open(hdfs_name_ver_fp, 'r', errors='replace') as ver_file:
        matched_lines = [line for line in ver_file if namespaceIDstr in line]

    # Same extraction as the original: first run of digits in the matched
    # line(s) is the namespace ID.
    id_matches = re.findall(r'(\d+)', ''.join(matched_lines))
    return id_matches[0] if id_matches else None


def usage_help():
    """Print the command-line usage text for this diagnostic script."""
    help_text = """This function helps in diagnosing and fixing problems in the
systems that help running MRImprov. This includes the Hadoop DFS.

Usage: python distrdebug.py COMMAND
where COMMAND is one of:
  -h | --help\t\t\tdisplays this help
  -d | --dfs <options>\t\tdiagnose and fix specific problems in the DFS. Following are the valid <options>
  \t\t\tnamespaceid - Fixes namespace-ids across all nodes - http://issues.apache.org/jira/browse/HDFS-107 
"""
    print(help_text)


# main function / starting point for diagnostic script
if __name__ == "__main__":
    print("""MR+ External programs Diagnostics
---------------------------------""")
    try:
        if len(sys.argv) <= 1:
            raise getopt.GetoptError('No arguments given')
        opts, args = getopt.getopt(sys.argv[1:], "hd:", ["help", "dfs="])
    except getopt.GetoptError:
        print("---Invalid usage---")
        usage_help()
        sys.exit(2)
    
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage_help()
            sys.exit()
        elif opt in ("-d", "--dfs"):
            if arg == 'namespaceid':
                check_fix_hdfs_namespace_id()
            else:
                usage_help()
                sys.exit()
                
    